Introduction

The goal of this project is to utilize statistical matching techniques to search for a subset of the prerelease user population that is most representative of Release.

In this first report, we will perform an exploratory analysis of the data, focusing on investigating the differences between Beta and Release users.

Loading the data

## Restore the saved workspace image that holds the training and validation
## data (presumably df_train_f and df_validate_f, used below -- TODO confirm).

load(file = "~/GitHub/ff-beta-release-matching/poc/EDA/data_milestone2_df_train_validate_20191025.RData")

Training

  • App Version 67
## Preview the first rows of the training data frame as a styled,
## horizontally scrollable HTML table.

kable(head(df_train_f)) %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed"),
    full_width = FALSE  # spelled out: `F` is an ordinary, reassignable variable
  ) %>%
  scroll_box(width = "100%")
client_id num_active_days content_crashes active_hours uri_count session_length search_count num_bookmarks num_pages daily_unique_domains daily_max_tabs daily_tabs_opened startup_ms daily_num_sessions_started active_hours_max uri_count_max session_length_max search_count_max num_pages_max daily_unique_domains_max daily_max_tabs_max daily_tabs_opened_max startup_ms_max daily_num_sessions_started_max label install_year profile_age fxa_configured sync_configured is_default_browser locale normalized_channel app_version default_search_engine country timezone_offset num_addons cpu_cores cpu_speed_mhz cpu_l2_cache_kb cpu_vendor memory_mb os_version is_wow64 FX_PAGE_LOAD_MS_2_PARENT TIME_TO_DOM_COMPLETE_MS TIME_TO_DOM_CONTENT_LOADED_END_MS TIME_TO_LOAD_EVENT_END_MS TIME_TO_DOM_INTERACTIVE_MS TIME_TO_NON_BLANK_PAINT_MS profile_age_cat distro_id_norm timezone_cat memory_cat cpu_speed_cat cpu_cores_cat is_release cpu_l2_cache_kb_cat
001cf926-92e3-4587-887e-d3156ba24d82 8 0 1.4215278 76.1250 22.9337499 1.875 11.00 4014.5 6.062500 7.00000 15.00 54176.2500 0.6250000 3.3208333 139 37.007500 6 15464 17.000000 11 29 180095 2 beta 2016 1160 False False True en-US beta 67 DuckDuckGo US -240 8 2 2527 256 Intel 4022 6.1 False 4223.089 5220.036 9079.136 5221.752 6157.840 5198.1300 < 5 years Mozilla (-6,-4] < 4GB < 3GHz 2 FALSE < 256
00210163-2123-427e-bb73-398bda9f9eba 5 0 0.8305556 168.2000 2.4390556 0.800 248.75 20599.5 7.866667 3.40000 14.80 3164.0667 1.2000000 1.6708333 325 8.318333 3 20719 17.000000 4 33 4966 2 beta 2016 1079 False False False en-US beta 67 DuckDuckGo GB 60 6 2 2394 256 Intel 3810 6.1 False 2148.350 2253.526 1159.979 2146.827 1155.050 1015.9784 < 5 years Mozilla (0,2] < 4GB < 3GHz 2 FALSE < 256
0024fd24-4ef5-4771-850a-9e3846597015 2 0 0.5111111 82.0000 0.8712505 2.500 9.00 87.0 2.166667 4.00000 8.00 23977.9444 5.0000000 0.8250000 145 1.464445 5 87 3.333333 6 15 31918 9 beta 2019 745 False False True en-US beta 67 Google GB 60 6 4 2394 256 Intel 8124 10.0 False 2699.834 2216.994 1832.642 2119.277 1819.612 1979.7910 < 5 years Mozilla (0,2] < 16GB < 3GHz < 4 FALSE < 256
004f70f7-2576-4de5-94b4-5bf1acdca0a8 8 0 0.3946181 101.8750 4.6785415 7.750 87.00 8882.0 7.750000 9.25000 19.50 1703.3125 1.3750000 1.1625000 210 9.174722 10 9044 13.000000 12 37 3454 2 beta 2018 130 True True True en-US beta 67 Google US -240 10 4 3991 256 Intel 16235 10.0 False 2370.563 2368.195 1614.073 2356.307 1497.652 850.0551 < 6 months Mozilla (-6,-4] < 16GB < 4GHz < 4 FALSE < 256
007c0c11-38e4-476b-a494-d732e15ac159 4 0 0.5930556 167.5000 5.6081245 0.250 13.00 3024.0 1.775000 8.75000 13.75 15285.5375 2.2500000 0.8791667 255 9.059722 1 3552 2.500000 27 36 24592 3 beta 2018 293 False False False en-US beta 67 Google US 360 7 2 2659 3072 Intel 3317 10.0 False 4050.417 5106.575 3630.348 5041.016 3618.008 2006.3376 < 2 years Mozilla (4,6] < 4GB < 3GHz 2 FALSE > 1024
0294837f-c98f-44ab-8237-30d2eba6c55a 6 0 1.6333333 165.8333 32.4127778 5.500 17.00 7740.7 8.583333 10.33333 20.00 975.2222 0.1666667 3.2319444 323 41.553611 12 7975 13.000000 18 35 1088 1 beta 2019 502 False False True en-US beta 67 other (non-bundled) GB 60 7 4 1800 256 Intel 8026 10.0 False 2740.793 2170.362 1482.620 2033.104 1161.901 1034.0580 < 2 years Mozilla (0,2] < 16GB < 2GHz < 4 FALSE < 256

Validation

  • App Version 68
## Preview the first rows of the validation data frame as a styled,
## horizontally scrollable HTML table.

kable(head(df_validate_f)) %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed"),
    full_width = FALSE  # spelled out: `F` is an ordinary, reassignable variable
  ) %>%
  scroll_box(width = "100%")
client_id num_active_days content_crashes active_hours uri_count session_length search_count num_bookmarks num_pages daily_unique_domains daily_max_tabs daily_tabs_opened startup_ms daily_num_sessions_started active_hours_max uri_count_max session_length_max search_count_max num_pages_max daily_unique_domains_max daily_max_tabs_max daily_tabs_opened_max startup_ms_max daily_num_sessions_started_max label install_year profile_age fxa_configured sync_configured is_default_browser locale normalized_channel app_version default_search_engine country timezone_offset num_addons cpu_cores cpu_speed_mhz cpu_l2_cache_kb cpu_vendor memory_mb os_version is_wow64 FX_PAGE_LOAD_MS_2_PARENT TIME_TO_DOM_COMPLETE_MS TIME_TO_DOM_CONTENT_LOADED_END_MS TIME_TO_LOAD_EVENT_END_MS TIME_TO_DOM_INTERACTIVE_MS TIME_TO_NON_BLANK_PAINT_MS profile_age_cat distro_id_norm timezone_cat memory_cat cpu_speed_cat cpu_cores_cat is_release cpu_l2_cache_kb_cat
001cf926-92e3-4587-887e-d3156ba24d82 8 0 1.5369792 71.12500 23.6797916 3.375000 11 1890.571 9.216667 8.500000 16.625 9928.483 0.6250000 2.3250000 120 34.020000 6 2094 20 14 31 17491.667 3 beta 2016 1204 False False True en-US beta 68 DuckDuckGo US -240 7 2 2527 256 Intel 4022 6.1 False 3133.947 3713.308 3748.471 3730.944 2444.799 1972.6632 < 5 years Mozilla (-6,-4] < 4GB < 3GHz 2 FALSE < 256
00210163-2123-427e-bb73-398bda9f9eba 2 0 0.1833333 43.50000 0.5619445 1.500000 259 22005.000 5.000000 3.000000 6.000 5413.500 1.0000000 0.2152778 48 0.781111 2 22005 7 4 7 9579.000 1 beta 2016 1124 False False False en-US beta 68 DuckDuckGo GB 60 5 2 2394 256 Intel 3810 6.1 False 3226.048 2561.596 1346.836 2523.810 1385.350 935.6222 < 5 years Mozilla (0,2] < 4GB < 3GHz 2 FALSE < 256
007c0c11-38e4-476b-a494-d732e15ac159 2 0 0.2423611 89.00000 6.6299995 0.000000 15 7203.000 1.000000 6.500000 9.500 7041.667 1.0000000 0.3222222 99 9.959722 0 7203 1 11 12 9194.333 2 beta 2018 336 False False False en-US beta 68 Google US 360 6 2 2659 3072 Intel 3317 10.0 False 4400.155 7244.930 3711.445 7280.457 3929.289 1474.5607 < 2 years Mozilla (4,6] < 4GB < 3GHz 2 FALSE > 1024
009ca4e9-874a-4c3e-983d-af0923346efb 3 0 0.1365741 29.66667 0.2358330 0.000000 11 2112.000 1.666667 1.666667 1.000 4644.333 1.6666667 0.1958333 53 0.387222 0 2112 2 2 1 6964.500 2 beta 2013 1606 False False True en-US beta 68 Google GB 360 6 2 2594 256 Intel 3965 6.2 False 5909.683 11043.408 5398.452 10410.288 5860.450 4538.1379 < 5 years Mozilla (4,6] < 4GB < 3GHz 2 FALSE < 256
0101d568-0c63-4492-9295-ed57ef78207f 3 0 0.3435185 15.66667 24.0802773 0.000000 7 17.000 1.000000 1.666667 1.500 4214.833 0.3333333 0.5083333 19 39.200277 0 18 1 2 2 4215.000 1 beta 2017 6 False False False en-US beta 68 Google US -420 5 4 3093 256 Intel 16274 6.3 True 2306.655 3225.400 2761.783 3493.289 2141.391 2823.6923 < 1 week Mozilla (-8,-6] < 16GB < 4GHz < 4 FALSE < 256
0159675e-15b0-4443-85b1-94de65455636 6 0 0.0946759 18.50000 10.6411113 1.166667 7 115.500 2.666667 3.166667 4.000 2427.250 1.1666667 0.1944444 33 29.938334 2 137 5 6 7 5234.000 2 beta 2019 17 False False False en-US beta 68 Google US -240 5 4 3292 256 Intel 8098 10.0 True 5100.212 6103.080 3665.146 6118.852 3707.192 3391.5286 < 1 month Mozilla (-6,-4] < 16GB < 4GHz < 4 FALSE < 256

Data Inspection

Training

To get acquainted with the training dataset, let’s look at its basic information.

rows columns discrete_columns continuous_columns all_missing_columns total_missing_values complete_rows total_observations memory_usage
302819 58 20 38 0 0 302819 17563502 135686992

Validation

To get acquainted with the validation dataset, let’s look at its basic information.

rows columns discrete_columns continuous_columns all_missing_columns total_missing_values complete_rows total_observations memory_usage
328042 58 20 38 0 0 328042 19026436 146987912

Observations

  • In both datasets, most variables are continuous
  • No NA values (missing values were handled in preprocessing)

Data Structure

Training

Let’s use the glimpse function to display a vertical preview of the training dataset, so that we can easily see each column’s data type and a sample of its values.

## Column-wise preview of the training data: one line per variable with its
## type and first few values.
glimpse(df_train_f)
## Observations: 302,819
## Variables: 58
## $ client_id                         <chr> "001cf926-92e3-4587-887e-d31...
## $ num_active_days                   <int> 8, 5, 2, 8, 4, 6, 8, 4, 3, 5...
## $ content_crashes                   <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ active_hours                      <dbl> 1.42152778, 0.83055556, 0.51...
## $ uri_count                         <dbl> 76.12500, 168.20000, 82.0000...
## $ session_length                    <dbl> 22.93374988, 2.43905560, 0.8...
## $ search_count                      <dbl> 1.875000, 0.800000, 2.500000...
## $ num_bookmarks                     <dbl> 11.00, 248.75, 9.00, 87.00, ...
## $ num_pages                         <dbl> 4014.5000, 20599.5000, 87.00...
## $ daily_unique_domains              <dbl> 6.062500, 7.866667, 2.166667...
## $ daily_max_tabs                    <dbl> 7.000000, 3.400000, 4.000000...
## $ daily_tabs_opened                 <dbl> 15.000000, 14.800000, 8.0000...
## $ startup_ms                        <dbl> 54176.2500, 3164.0667, 23977...
## $ daily_num_sessions_started        <dbl> 0.6250000, 1.2000000, 5.0000...
## $ active_hours_max                  <dbl> 3.3208333, 1.6708333, 0.8250...
## $ uri_count_max                     <int> 139, 325, 145, 210, 255, 323...
## $ session_length_max                <dbl> 37.007500, 8.318333, 1.46444...
## $ search_count_max                  <int> 6, 3, 5, 10, 1, 12, 29, 0, 0...
## $ num_pages_max                     <dbl> 15464.0, 20719.0, 87.0, 9044...
## $ daily_unique_domains_max          <dbl> 17.000000, 17.000000, 3.3333...
## $ daily_max_tabs_max                <int> 11, 4, 6, 12, 27, 18, 18, 2,...
## $ daily_tabs_opened_max             <int> 29, 33, 15, 37, 36, 35, 170,...
## $ startup_ms_max                    <dbl> 180095.000, 4966.000, 31918....
## $ daily_num_sessions_started_max    <int> 2, 2, 9, 2, 3, 1, 6, 1, 2, 3...
## $ label                             <fct> beta, beta, beta, beta, beta...
## $ install_year                      <dbl> 2016, 2016, 2019, 2018, 2018...
## $ profile_age                       <dbl> 1160, 1079, 745, 130, 293, 5...
## $ fxa_configured                    <fct> False, False, False, True, F...
## $ sync_configured                   <fct> False, False, False, True, F...
## $ is_default_browser                <fct> True, False, True, True, Fal...
## $ locale                            <fct> en-US, en-US, en-US, en-US, ...
## $ normalized_channel                <fct> beta, beta, beta, beta, beta...
## $ app_version                       <dbl> 67, 67, 67, 67, 67, 67, 67, ...
## $ default_search_engine             <fct> DuckDuckGo, DuckDuckGo, Goog...
## $ country                           <fct> US, GB, GB, US, US, GB, GB, ...
## $ timezone_offset                   <int> -240, 60, 60, -240, 360, 60,...
## $ num_addons                        <dbl> 8.00, 6.00, 6.00, 10.00, 7.0...
## $ cpu_cores                         <dbl> 2, 2, 4, 4, 2, 4, 2, 2, 2, 2...
## $ cpu_speed_mhz                     <dbl> 2527, 2394, 2394, 3991, 2659...
## $ cpu_l2_cache_kb                   <dbl> 256, 256, 256, 256, 3072, 25...
## $ cpu_vendor                        <fct> Intel, Intel, Intel, Intel, ...
## $ memory_mb                         <int> 4022, 3810, 8124, 16235, 331...
## $ os_version                        <ord> 6.1, 6.1, 10.0, 10.0, 10.0, ...
## $ is_wow64                          <fct> False, False, False, False, ...
## $ FX_PAGE_LOAD_MS_2_PARENT          <dbl> 4223.0885, 2148.3495, 2699.8...
## $ TIME_TO_DOM_COMPLETE_MS           <dbl> 5220.036, 2253.526, 2216.994...
## $ TIME_TO_DOM_CONTENT_LOADED_END_MS <dbl> 9079.1364, 1159.9791, 1832.6...
## $ TIME_TO_LOAD_EVENT_END_MS         <dbl> 5221.7525, 2146.8265, 2119.2...
## $ TIME_TO_DOM_INTERACTIVE_MS        <dbl> 6157.8399, 1155.0503, 1819.6...
## $ TIME_TO_NON_BLANK_PAINT_MS        <dbl> 5198.1300, 1015.9784, 1979.7...
## $ profile_age_cat                   <ord> < 5 years, < 5 years, < 5 ye...
## $ distro_id_norm                    <fct> Mozilla, Mozilla, Mozilla, M...
## $ timezone_cat                      <fct> "(-6,-4]", "(0,2]", "(0,2]",...
## $ memory_cat                        <ord> < 4GB, < 4GB, < 16GB, < 16GB...
## $ cpu_speed_cat                     <ord> < 3GHz, < 3GHz, < 3GHz, < 4G...
## $ cpu_cores_cat                     <ord> 2, 2, < 4, < 4, 2, < 4, 2, 2...
## $ is_release                        <lgl> FALSE, FALSE, FALSE, FALSE, ...
## $ cpu_l2_cache_kb_cat               <fct> < 256, < 256, < 256, < 256, ...

If we want to get some metrics about data types, zeros, infinite numbers, and missing values, we can use the df_status function.

## Per-variable status of the training data (zeros, NAs, infinite values,
## type, number of unique values), rendered as a scrollable HTML table.
## print_results = FALSE: the summary is passed on to kable() rather than
## printed by df_status() itself.
kable(df_status(df_train_f, print_results = FALSE)) %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed"),
    full_width = FALSE  # spelled out: `F` is an ordinary, reassignable variable
  ) %>%
  scroll_box(width = "100%")
variable q_zeros p_zeros q_na p_na q_inf p_inf type unique
client_id 0 0.00 0 0 0 0 character 302805
num_active_days 0 0.00 0 0 0 0 integer 8
content_crashes 302819 100.00 0 0 0 0 integer 1
active_hours 7 0.00 0 0 0 0 numeric 180711
uri_count 0 0.00 0 0 0 0 numeric 18610
session_length 0 0.00 0 0 0 0 numeric 289472
search_count 80328 26.53 0 0 0 0 numeric 930
num_bookmarks 472 0.16 0 0 0 0 numeric 19727
num_pages 7 0.00 0 0 0 0 numeric 179391
daily_unique_domains 0 0.00 0 0 0 0 numeric 66058
daily_max_tabs 0 0.00 0 0 0 0 numeric 2888
daily_tabs_opened 0 0.00 0 0 0 0 numeric 4645
startup_ms 0 0.00 0 0 0 0 numeric 276270
daily_num_sessions_started 3100 1.02 0 0 0 0 numeric 654
active_hours_max 7 0.00 0 0 0 0 numeric 39216
uri_count_max 0 0.00 0 0 0 0 integer 3642
session_length_max 0 0.00 0 0 0 0 numeric 188078
search_count_max 80328 26.53 0 0 0 0 integer 140
num_pages_max 7 0.00 0 0 0 0 numeric 76011
daily_unique_domains_max 0 0.00 0 0 0 0 numeric 1996
daily_max_tabs_max 0 0.00 0 0 0 0 integer 541
daily_tabs_opened_max 0 0.00 0 0 0 0 integer 853
startup_ms_max 0 0.00 0 0 0 0 numeric 145192
daily_num_sessions_started_max 3100 1.02 0 0 0 0 integer 85
label 0 0.00 0 0 0 0 factor 2
install_year 0 0.00 0 0 0 0 numeric 21
profile_age 6320 2.09 0 0 0 0 numeric 4127
fxa_configured 0 0.00 0 0 0 0 factor 2
sync_configured 0 0.00 0 0 0 0 factor 2
is_default_browser 0 0.00 0 0 0 0 factor 2
locale 0 0.00 0 0 0 0 factor 2
normalized_channel 0 0.00 0 0 0 0 factor 2
app_version 0 0.00 0 0 0 0 numeric 1
default_search_engine 0 0.00 0 0 0 0 factor 6
country 0 0.00 0 0 0 0 factor 2
timezone_offset 736 0.24 0 0 0 0 integer 35
num_addons 53 0.02 0 0 0 0 numeric 2124
cpu_cores 0 0.00 0 0 0 0 numeric 27
cpu_speed_mhz 0 0.00 0 0 0 0 numeric 1232
cpu_l2_cache_kb 0 0.00 0 0 0 0 numeric 8
cpu_vendor 0 0.00 0 0 0 0 factor 3
memory_mb 0 0.00 0 0 0 0 integer 5893
os_version 0 0.00 0 0 0 0 ordered-factor 5
is_wow64 0 0.00 0 0 0 0 factor 2
FX_PAGE_LOAD_MS_2_PARENT 0 0.00 0 0 0 0 numeric 294778
TIME_TO_DOM_COMPLETE_MS 0 0.00 0 0 0 0 numeric 300964
TIME_TO_DOM_CONTENT_LOADED_END_MS 0 0.00 0 0 0 0 numeric 300503
TIME_TO_LOAD_EVENT_END_MS 0 0.00 0 0 0 0 numeric 301045
TIME_TO_DOM_INTERACTIVE_MS 0 0.00 0 0 0 0 numeric 300036
TIME_TO_NON_BLANK_PAINT_MS 0 0.00 0 0 0 0 numeric 296802
profile_age_cat 0 0.00 0 0 0 0 ordered-factor 6
distro_id_norm 0 0.00 0 0 0 0 factor 4
timezone_cat 0 0.00 0 0 0 0 factor 13
memory_cat 0 0.00 0 0 0 0 ordered-factor 6
cpu_speed_cat 0 0.00 0 0 0 0 ordered-factor 5
cpu_cores_cat 0 0.00 0 0 0 0 ordered-factor 6
is_release 59627 19.69 0 0 0 0 logical 2
cpu_l2_cache_kb_cat 0 0.00 0 0 0 0 factor 4
  • q_zeros: quantity of zeros (p_zeros: in percent)
  • q_inf: quantity of infinite values (p_inf: in percent)
  • q_na: quantity of NA (p_na: in percent)
  • type: factor, ordered-factor, numeric, integer, logical or character
  • unique: quantity of unique values

Validation

Let’s use the glimpse function to display a vertical preview of the validation dataset.

## Column-wise preview of the validation data: one line per variable with its
## type and first few values.
glimpse(df_validate_f)
## Observations: 328,042
## Variables: 58
## $ client_id                         <chr> "001cf926-92e3-4587-887e-d31...
## $ num_active_days                   <int> 8, 2, 2, 3, 3, 6, 1, 4, 7, 4...
## $ content_crashes                   <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ active_hours                      <dbl> 1.53697917, 0.18333333, 0.24...
## $ uri_count                         <dbl> 71.125000, 43.500000, 89.000...
## $ session_length                    <dbl> 23.6797916, 0.5619445, 6.629...
## $ search_count                      <dbl> 3.3750000, 1.5000000, 0.0000...
## $ num_bookmarks                     <dbl> 11.0, 259.0, 15.0, 11.0, 7.0...
## $ num_pages                         <dbl> 1890.5714, 22005.0000, 7203....
## $ daily_unique_domains              <dbl> 9.216667, 5.000000, 1.000000...
## $ daily_max_tabs                    <dbl> 8.500000, 3.000000, 6.500000...
## $ daily_tabs_opened                 <dbl> 16.625, 6.000, 9.500, 1.000,...
## $ startup_ms                        <dbl> 9928.4833, 5413.5000, 7041.6...
## $ daily_num_sessions_started        <dbl> 0.6250000, 1.0000000, 1.0000...
## $ active_hours_max                  <dbl> 2.32500000, 0.21527778, 0.32...
## $ uri_count_max                     <int> 120, 48, 99, 53, 19, 33, 32,...
## $ session_length_max                <dbl> 34.020000, 0.781111, 9.95972...
## $ search_count_max                  <int> 6, 2, 0, 0, 0, 2, 0, 5, 1, 1...
## $ num_pages_max                     <dbl> 2094, 22005, 7203, 2112, 18,...
## $ daily_unique_domains_max          <dbl> 20.000000, 7.000000, 1.00000...
## $ daily_max_tabs_max                <int> 14, 4, 11, 2, 2, 6, 8, 3, 6,...
## $ daily_tabs_opened_max             <int> 31, 7, 12, 1, 2, 7, 13, 71, ...
## $ startup_ms_max                    <dbl> 17491.667, 9579.000, 9194.33...
## $ daily_num_sessions_started_max    <int> 3, 1, 2, 2, 1, 2, 2, 2, 2, 1...
## $ label                             <fct> beta, beta, beta, beta, beta...
## $ install_year                      <dbl> 2016, 2016, 2018, 2013, 2017...
## $ profile_age                       <dbl> 1204, 1124, 336, 1606, 6, 17...
## $ fxa_configured                    <fct> False, False, False, False, ...
## $ sync_configured                   <fct> False, False, False, False, ...
## $ is_default_browser                <fct> True, False, False, True, Fa...
## $ locale                            <fct> en-US, en-US, en-US, en-US, ...
## $ normalized_channel                <fct> beta, beta, beta, beta, beta...
## $ app_version                       <dbl> 68, 68, 68, 68, 68, 68, 68, ...
## $ default_search_engine             <fct> DuckDuckGo, DuckDuckGo, Goog...
## $ country                           <fct> US, GB, US, GB, US, US, US, ...
## $ timezone_offset                   <int> -240, 60, 360, 360, -420, -2...
## $ num_addons                        <dbl> 7.0, 5.0, 6.0, 6.0, 5.0, 5.0...
## $ cpu_cores                         <dbl> 2, 2, 2, 2, 4, 4, 1, 2, 4, 3...
## $ cpu_speed_mhz                     <dbl> 2527, 2394, 2659, 2594, 3093...
## $ cpu_l2_cache_kb                   <dbl> 256, 256, 3072, 256, 256, 25...
## $ cpu_vendor                        <fct> Intel, Intel, Intel, Intel, ...
## $ memory_mb                         <int> 4022, 3810, 3317, 3965, 1627...
## $ os_version                        <ord> 6.1, 6.1, 10.0, 6.2, 6.3, 10...
## $ is_wow64                          <fct> False, False, False, False, ...
## $ FX_PAGE_LOAD_MS_2_PARENT          <dbl> 3133.947, 3226.048, 4400.155...
## $ TIME_TO_DOM_COMPLETE_MS           <dbl> 3713.308, 2561.596, 7244.930...
## $ TIME_TO_DOM_CONTENT_LOADED_END_MS <dbl> 3748.4715, 1346.8361, 3711.4...
## $ TIME_TO_LOAD_EVENT_END_MS         <dbl> 3730.944, 2523.810, 7280.457...
## $ TIME_TO_DOM_INTERACTIVE_MS        <dbl> 2444.7985, 1385.3500, 3929.2...
## $ TIME_TO_NON_BLANK_PAINT_MS        <dbl> 1972.6632, 935.6222, 1474.56...
## $ profile_age_cat                   <ord> < 5 years, < 5 years, < 2 ye...
## $ distro_id_norm                    <fct> Mozilla, Mozilla, Mozilla, M...
## $ timezone_cat                      <fct> "(-6,-4]", "(0,2]", "(4,6]",...
## $ memory_cat                        <ord> < 4GB, < 4GB, < 4GB, < 4GB, ...
## $ cpu_speed_cat                     <ord> < 3GHz, < 3GHz, < 3GHz, < 3G...
## $ cpu_cores_cat                     <ord> 2, 2, 2, 2, < 4, < 4, 1, 2, ...
## $ is_release                        <lgl> FALSE, FALSE, FALSE, FALSE, ...
## $ cpu_l2_cache_kb_cat               <fct> < 256, < 256, > 1024, < 256,...
## Per-variable status of the VALIDATION data (zeros, NAs, infinite values,
## type, number of unique values), rendered as a scrollable HTML table.
## This chunk previously summarised df_train_f by mistake.
## NOTE(review): the table rendered below is identical to the training table,
## i.e. it was produced from df_train_f; re-knit to refresh it.
kable(df_status(df_validate_f, print_results = FALSE)) %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed"),
    full_width = FALSE  # spelled out: `F` is an ordinary, reassignable variable
  ) %>%
  scroll_box(width = "100%")
variable q_zeros p_zeros q_na p_na q_inf p_inf type unique
client_id 0 0.00 0 0 0 0 character 302805
num_active_days 0 0.00 0 0 0 0 integer 8
content_crashes 302819 100.00 0 0 0 0 integer 1
active_hours 7 0.00 0 0 0 0 numeric 180711
uri_count 0 0.00 0 0 0 0 numeric 18610
session_length 0 0.00 0 0 0 0 numeric 289472
search_count 80328 26.53 0 0 0 0 numeric 930
num_bookmarks 472 0.16 0 0 0 0 numeric 19727
num_pages 7 0.00 0 0 0 0 numeric 179391
daily_unique_domains 0 0.00 0 0 0 0 numeric 66058
daily_max_tabs 0 0.00 0 0 0 0 numeric 2888
daily_tabs_opened 0 0.00 0 0 0 0 numeric 4645
startup_ms 0 0.00 0 0 0 0 numeric 276270
daily_num_sessions_started 3100 1.02 0 0 0 0 numeric 654
active_hours_max 7 0.00 0 0 0 0 numeric 39216
uri_count_max 0 0.00 0 0 0 0 integer 3642
session_length_max 0 0.00 0 0 0 0 numeric 188078
search_count_max 80328 26.53 0 0 0 0 integer 140
num_pages_max 7 0.00 0 0 0 0 numeric 76011
daily_unique_domains_max 0 0.00 0 0 0 0 numeric 1996
daily_max_tabs_max 0 0.00 0 0 0 0 integer 541
daily_tabs_opened_max 0 0.00 0 0 0 0 integer 853
startup_ms_max 0 0.00 0 0 0 0 numeric 145192
daily_num_sessions_started_max 3100 1.02 0 0 0 0 integer 85
label 0 0.00 0 0 0 0 factor 2
install_year 0 0.00 0 0 0 0 numeric 21
profile_age 6320 2.09 0 0 0 0 numeric 4127
fxa_configured 0 0.00 0 0 0 0 factor 2
sync_configured 0 0.00 0 0 0 0 factor 2
is_default_browser 0 0.00 0 0 0 0 factor 2
locale 0 0.00 0 0 0 0 factor 2
normalized_channel 0 0.00 0 0 0 0 factor 2
app_version 0 0.00 0 0 0 0 numeric 1
default_search_engine 0 0.00 0 0 0 0 factor 6
country 0 0.00 0 0 0 0 factor 2
timezone_offset 736 0.24 0 0 0 0 integer 35
num_addons 53 0.02 0 0 0 0 numeric 2124
cpu_cores 0 0.00 0 0 0 0 numeric 27
cpu_speed_mhz 0 0.00 0 0 0 0 numeric 1232
cpu_l2_cache_kb 0 0.00 0 0 0 0 numeric 8
cpu_vendor 0 0.00 0 0 0 0 factor 3
memory_mb 0 0.00 0 0 0 0 integer 5893
os_version 0 0.00 0 0 0 0 ordered-factor 5
is_wow64 0 0.00 0 0 0 0 factor 2
FX_PAGE_LOAD_MS_2_PARENT 0 0.00 0 0 0 0 numeric 294778
TIME_TO_DOM_COMPLETE_MS 0 0.00 0 0 0 0 numeric 300964
TIME_TO_DOM_CONTENT_LOADED_END_MS 0 0.00 0 0 0 0 numeric 300503
TIME_TO_LOAD_EVENT_END_MS 0 0.00 0 0 0 0 numeric 301045
TIME_TO_DOM_INTERACTIVE_MS 0 0.00 0 0 0 0 numeric 300036
TIME_TO_NON_BLANK_PAINT_MS 0 0.00 0 0 0 0 numeric 296802
profile_age_cat 0 0.00 0 0 0 0 ordered-factor 6
distro_id_norm 0 0.00 0 0 0 0 factor 4
timezone_cat 0 0.00 0 0 0 0 factor 13
memory_cat 0 0.00 0 0 0 0 ordered-factor 6
cpu_speed_cat 0 0.00 0 0 0 0 ordered-factor 5
cpu_cores_cat 0 0.00 0 0 0 0 ordered-factor 6
is_release 59627 19.69 0 0 0 0 logical 2
cpu_l2_cache_kb_cat 0 0.00 0 0 0 0 factor 4

Observations

Are all the variables in the correct data type?

Yes; no variable appears to have an incorrect type. It seems that this has already been dealt with in preprocessing.

Any variables with lots of zeros?

Yes. Variables with lots of zeros may not be useful for modeling and, in some cases, they may dramatically bias the model. For example, content_crashes is equal to zero for 100% of the observations.

Any variables with lots of NAs?

None. Good news.

Any high cardinality variable?

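None. Factor/categorical variables with a high number of distinct values (~30 or more) tend to cause overfitting when individual categories contain only a few observations. In the status table above, the factor with the most levels is timezone_cat, with 13.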

Beta vs Release

## Split each dataset by channel label so that Release and Beta users can be
## profiled separately. `%in%` never yields NA, so rows with a missing label
## are left out of both subsets.

## Training
df_release <- df_train_f[df_train_f$label %in% "release", ]
df_beta    <- df_train_f[df_train_f$label %in% "beta", ]

## Validation
df_v_release <- df_validate_f[df_validate_f$label %in% "release", ]
df_v_beta    <- df_validate_f[df_validate_f$label %in% "beta", ]

Training

  • Composition:
## Label composition (beta vs. release) of the training data via freq();
## the value it returns is kept in `f`.
f <- freq(df_train_f$label)

Validation

  • Composition:
## Label composition (beta vs. release) of the validation data via freq();
## the value it returns is kept in `f` (overwriting any earlier `f`).
f <- freq(df_validate_f$label)

Analyzing Discrete Variables

Training

## Bar charts of the discrete variables -- training data, Release users
plot_bar(
  data = df_release,
  title = "Release",
  ggtheme = theme_minimal(base_size = 15)
)

## Bar charts of the discrete variables -- training data, Beta users
plot_bar(
  data = df_beta,
  title = "Beta",
  ggtheme = theme_minimal(base_size = 15)
)

Validation

## Bar charts of the discrete variables -- validation data, Release users
plot_bar(
  data = df_v_release,
  title = "Release",
  ggtheme = theme_minimal(base_size = 15)
)

## Bar charts of the discrete variables -- validation data, Beta users
plot_bar(
  data = df_v_beta,
  title = "Beta",
  ggtheme = theme_minimal(base_size = 15)
)

Analyzing Continuous Variables

Training

## Histograms of the continuous variables -- training data, Release users
plot_histogram(
  data = df_release,
  title = "Release",
  ggtheme = theme_minimal(base_size = 15)
)

## Histograms of the continuous variables -- training data, Beta users
plot_histogram(
  data = df_beta,
  title = "Beta",
  ggtheme = theme_minimal(base_size = 15)
)

Validation

## Histograms of the continuous variables -- validation data, Release users.
## Uses the validation split (df_v_release); this chunk previously re-plotted
## the training split df_release.
plot_histogram(
  data = df_v_release,
  title = "Release",
  ggtheme = theme_minimal(base_size = 15)
)

## Histograms of the continuous variables -- validation data, Beta users.
## Uses the validation split (df_v_beta); previously the training df_beta.
plot_histogram(
  data = df_v_beta,
  title = "Beta",
  ggtheme = theme_minimal(base_size = 15)
)


Observations

  • My first impression is that user engagement metrics might be a good path to follow

Plotting Density Curves

## Density curve of one metric, filled by channel label, for one data frame.
##
## data:     data frame with a `label` column and the column named in `metric`
## metric:   name of the column to plot, as a string
## x_label:  x-axis title
## x_limits: optional c(lower, upper) passed to xlim(); NULL leaves the axis
##           unrestricted
##
## Returns the ggplot object (not printed here).
plot_label_density <- function(data, metric, x_label, x_limits = NULL) {
  density_plot <- ggplot(
    data = data,
    aes(x = .data[[metric]], group = label, fill = label)
  ) +
    geom_density(adjust = 1.5, alpha = 0.6) +
    scale_fill_viridis(discrete = TRUE) +
    labs(x = x_label, y = "Density") +
    theme_ipsum()

  if (!is.null(x_limits)) {
    ## xlim() discards observations outside the range *before* the density is
    ## estimated (unlike coord_cartesian(), which only zooms), so the curves
    ## describe the truncated data. Kept as in the original analysis.
    density_plot <- density_plot + xlim(x_limits[1], x_limits[2])
  }

  density_plot
}

## Training and validation densities of one metric, side by side.
## Arguments are forwarded to plot_label_density(); reads the data frames
## df_train_f and df_validate_f from the calling environment.
compare_label_density <- function(metric, x_label, x_limits = NULL) {
  train_plot <- plot_label_density(df_train_f, metric, x_label, x_limits)
  validate_plot <- plot_label_density(df_validate_f, metric, x_label, x_limits)

  ## One row, two columns: training on the left, validation on the right
  plot_grid(
    train_plot, validate_plot,
    ncol = 2,
    labels = c("Train", "Validate")
  )
}

## URI count
compare_label_density("uri_count", "URI Count", x_limits = c(0, 1000))

## Active hours (full range)
compare_label_density("active_hours", "Active Hours")

## Number of pages
compare_label_density("num_pages", "Num Pages", x_limits = c(0, 50000))

## Session length
compare_label_density("session_length", "Session Length", x_limits = c(0, 75))

User Engagement Metrics

Continuous Variables

This section focuses only on continuous metrics: mainly user engagement metrics, together with a few profile and hardware attributes. We are going to analyze the following metrics:

  • num_active_days
  • active_hours
  • active_hours_max
  • uri_count
  • uri_count_max
  • session_length
  • session_length_max
  • search_count
  • search_count_max
  • num_bookmarks
  • num_pages
  • num_pages_max
  • num_addons
  • daily_unique_domains
  • daily_unique_domains_max
  • daily_max_tabs
  • daily_max_tabs_max
  • daily_tabs_opened
  • daily_tabs_opened_max
  • daily_num_sessions_started
  • daily_num_sessions_started_max
  • startup_ms
  • install_year
  • profile_age
  • timezone_offset
  • memory_mb
  • cpu_cores
  • cpu_speed_mhz
  • cpu_l2_cache_kb

Training

Beta-Release Difference

## Render the training Beta-vs-Release summary table (text_tbl is built
## outside the code shown in this report) as a scrollable HTML table.
kable(text_tbl) %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed"),
    full_width = FALSE  # spelled out: `F` is an ordinary, reassignable variable
  ) %>%
  scroll_box(width = "100%")
beta_num_active_days release_num_active_days beta_active_hours release_active_hours beta_active_hours_max release_active_hours_max beta_uri_count release_uri_count beta_uri_count_max release_uri_count_max beta_session_length release_session_length beta_session_length_max release_session_length_max beta_search_count release_search_count beta_search_count_max release_search_count_max beta_num_bookmarks release_num_bookmarks beta_num_pages release_num_pages beta_num_pages_max release_num_pages_max beta_daily_unique_domains release_daily_unique_domains beta_daily_max_tabs release_daily_max_tabs beta_daily_tabs_opened release_daily_tabs_opened beta_daily_num_sessions_started release_daily_num_sessions_started beta_daily_unique_domains_max release_daily_unique_domains_max beta_daily_max_tabs_max release_daily_max_tabs_max beta_daily_tabs_opened_max release_daily_tabs_opened_max beta_daily_num_sessions_started_max release_daily_num_sessions_started_max beta_startup_ms release_startup_ms beta_install_year release_install_year beta_profile_age release_profile_age beta_timezone_offset release_timezone_offset beta_memory_mb release_memory_mb beta_cpu_cores release_cpu_cores beta_cpu_speed_mhz release_cpu_speed_mhz beta_cpu_l2_cache_kb release_cpu_l2_cache_kb
Min. 1.000000 1.000000 0.0000000 0.0000000 0.000000 0.0000000 1.00000 1.00000 1.0000 1.0000 0.0166665 0.0192595 0.021389 0.030556 0.0000000 0.000000 0.000000 0.000000 0.0000 0.0000 1.000 0.000 1.00 0.00 1.000000 1.000000 1.000000 0.625000 1.00000 1.000000 0.000000 0.000000 1.00000 1.000000 1.00000 1.000000 1.00000 1.00000 0.000000 0.000000 269.000 261.128 1993.000 2000.000 0.0000 0.0000 -720.000 -720.000 511.000 512.000 1.000000 1.000000 798.000 792.00 128.0000 128.0000
1st Qu. 4.000000 4.000000 0.2250000 0.2686111 0.450000 0.5402778 37.00000 44.33333 68.0000 86.0000 2.5284028 2.1590431 4.877777 4.408542 0.0000000 0.000000 0.000000 0.000000 10.0000 10.0000 686.000 1022.125 785.00 1142.00 2.166667 2.283333 2.600000 2.500000 4.00000 4.000000 1.000000 1.250000 3.00000 3.125000 4.00000 4.000000 6.00000 7.00000 2.000000 2.000000 2102.206 1432.677 2017.000 2016.000 271.0000 257.0000 -300.000 -300.000 3984.000 4011.000 2.000000 2.000000 2200.000 2261.00 256.0000 256.0000
Median 6.000000 6.000000 0.5309524 0.5744444 1.063889 1.1541667 86.66667 96.66667 172.0000 196.0000 7.7105554 6.3351191 14.808889 11.701666 0.8333333 0.875000 2.000000 2.000000 26.0000 26.0000 4185.667 5536.000 4340.00 5705.50 3.562500 3.600000 4.250000 3.714286 9.00000 8.833333 1.666667 2.000000 5.50000 6.000000 6.00000 6.000000 17.00000 17.00000 3.000000 4.000000 5088.010 3231.339 2018.000 2018.000 711.0000 698.0000 -240.000 -240.000 8031.000 8069.000 2.000000 2.000000 2594.000 2712.00 256.0000 256.0000
Mean 5.346169 5.569842 0.8236611 0.8468557 1.577508 1.6251135 152.74550 156.24224 311.0213 321.3891 12.2961990 9.2821806 22.706568 18.210749 2.4506498 2.376504 5.636171 5.434352 242.4878 158.9390 17363.463 17330.600 17558.93 17518.75 5.060464 4.968328 9.603628 6.200080 20.49191 17.092979 2.368895 2.888602 8.74361 8.552061 13.81149 9.317556 39.64786 33.27059 4.281399 5.248573 25835.963 9832.051 2017.138 2017.064 893.7534 894.7365 -143.855 -238.714 8965.156 9443.657 2.975699 3.143089 2678.209 2710.62 679.9325 625.9611
3rd Qu. 8.000000 8.000000 1.1028646 1.1265956 2.165278 2.1902778 188.50000 197.00000 382.0000 400.0000 19.5792560 13.6608531 31.519026 26.128403 2.8333333 3.000000 7.000000 7.000000 96.0000 85.2125 18605.464 19680.625 18885.50 19922.00 6.166667 6.070833 8.000000 6.000000 21.75000 19.166667 2.875000 3.500000 11.00000 11.000000 12.00000 9.000000 42.00000 38.00000 5.000000 6.000000 12618.764 8394.891 2018.000 2018.000 1354.0000 1374.0000 60.000 -240.000 10238.000 12144.000 4.000000 4.000000 3192.000 3193.00 512.0000 512.0000
Max. 8.000000 8.000000 7.2901042 7.1222222 24.983333 23.9666667 2931.00000 2391.25000 15626.0000 18032.0000 240.8048605 91.0663890 1255.382223 384.288333 51.0000000 45.750000 188.000000 217.000000 40401.0000 18632.0000 179657.500 168416.286 180456.00 172543.00 44.000000 39.375000 1012.625000 445.375000 518.25000 347.500000 32.833333 32.250000 100.00000 100.000000 3149.00000 2425.000000 3302.00000 2410.00000 88.000000 100.000000 17109505.514 5358122.833 2019.000 2019.000 7051.0000 6922.0000 840.000 720.000 262078.000 524254.000 36.000000 40.000000 37214.000 15077.00 6144.0000 6144.0000

Validation

Beta-Release Difference

# Render `text_tbl_v` as a compact Bootstrap table (striped, hover,
# condensed; not stretched to full width) inside a horizontally
# scrollable box, since the table has many columns.
styled_tbl_v <- kable_styling(
  kable(text_tbl_v),
  bootstrap_options = c("striped", "hover", "condensed"),
  full_width = FALSE
)
scroll_box(styled_tbl_v, width = "100%")
beta_num_active_days release_num_active_days beta_active_hours release_active_hours beta_active_hours_max release_active_hours_max beta_uri_count release_uri_count beta_uri_count_max release_uri_count_max beta_session_length release_session_length beta_session_length_max release_session_length_max beta_search_count release_search_count beta_search_count_max release_search_count_max beta_num_bookmarks release_num_bookmarks beta_num_pages release_num_pages beta_num_pages_max release_num_pages_max beta_daily_unique_domains release_daily_unique_domains beta_daily_max_tabs release_daily_max_tabs beta_daily_tabs_opened release_daily_tabs_opened beta_daily_num_sessions_started release_daily_num_sessions_started beta_daily_unique_domains_max release_daily_unique_domains_max beta_daily_max_tabs_max release_daily_max_tabs_max beta_daily_tabs_opened_max release_daily_tabs_opened_max beta_daily_num_sessions_started_max release_daily_num_sessions_started_max beta_startup_ms release_startup_ms beta_install_year release_install_year beta_profile_age release_profile_age beta_timezone_offset release_timezone_offset beta_memory_mb release_memory_mb beta_cpu_cores release_cpu_cores beta_cpu_speed_mhz release_cpu_speed_mhz beta_cpu_l2_cache_kb release_cpu_l2_cache_kb
Min. 1.000000 1.000000 0.0000000 0.0000000 0.0000000 0.000000 1.0000 1.00000 1.0000 1.0000 0.0199998 0.0157222 0.041111 0.019722 0.000000 0.000000 0.000000 0.000000 0.0000 0.00000 0.0000 0.0000 0.00 0.00 1.000000 1.000000 0.400000 0.5714286 1.00000 1.000000 0.000000 0.000000 1.00000 1.000000 1.00000 1.000000 1.00000 1.00000 0.000000 0.000000 289.000 2.388333e+02 2000.000 2000.000 0.0000 0.000 -720.0000 -720.0000 511.000 512.000 1.000000 1.000000 633.00 768.000 128.0000 128.0000
1st Qu. 3.000000 4.000000 0.2074074 0.2640873 0.3916667 0.537500 33.2500 44.00000 57.0000 86.0000 2.2133335 2.3199533 4.002778 4.788333 0.000000 0.000000 0.000000 0.000000 9.0000 10.00000 543.3333 991.3333 620.00 1113.00 2.125000 2.287037 2.500000 2.5000000 3.50000 4.000000 1.000000 1.166667 3.00000 3.200000 4.00000 4.000000 5.00000 7.00000 1.000000 2.000000 2218.863 1.567167e+03 2017.000 2017.000 213.0000 235.000 -300.0000 -300.0000 3981.000 4021.000 2.000000 2.000000 2195.00 2261.000 256.0000 256.0000
Median 5.000000 6.000000 0.5027778 0.5751736 0.9625000 1.165278 80.5000 97.42857 152.0000 199.0000 7.1543749 6.7808331 12.961389 12.802778 0.750000 1.000000 2.000000 3.000000 23.0000 25.33333 3347.5000 5308.0000 3513.00 5490.00 3.500000 3.651190 4.125000 3.8000000 8.50000 8.857143 1.666667 2.000000 5.20000 6.000000 6.00000 6.000000 15.00000 17.00000 3.000000 4.000000 5015.522 3.345629e+03 2018.000 2018.000 690.0000 673.000 -240.0000 -240.0000 7973.000 8073.000 2.000000 3.000000 2594.00 2712.000 256.0000 256.0000
Mean 4.912574 5.710307 0.7988445 0.8524956 1.4711888 1.636961 146.3339 158.71931 287.3003 328.3892 12.3367205 9.7067904 22.272966 18.614122 2.324319 2.446479 5.100206 5.633558 225.4153 158.03362 15614.0379 17089.9304 15779.79 17289.08 5.148112 5.112258 9.019717 6.3471824 20.03166 17.187064 2.398131 2.831202 8.58199 8.837358 12.82845 9.539785 37.29553 33.54423 4.141417 5.180103 50072.589 2.727928e+04 2017.255 2017.194 875.2575 883.857 -129.9082 -240.4712 8795.994 9719.802 2.954155 3.191904 2656.31 2712.603 674.5779 610.7777
3rd Qu. 7.000000 8.000000 1.0600694 1.1399306 2.0013889 2.220833 179.0000 200.14286 352.0000 409.0000 18.7390970 14.7740975 28.951388 27.097500 2.666667 3.000000 6.000000 7.000000 85.0000 84.00000 15660.6250 19359.8333 15872.00 19606.00 6.166667 6.250000 7.750000 6.2857143 21.00000 19.500000 3.000000 3.375000 10.50000 11.000000 11.00000 10.000000 39.00000 38.00000 5.000000 6.000000 11210.000 7.770437e+03 2019.000 2019.000 1329.0000 1368.000 60.0000 -240.0000 8189.000 12180.000 4.000000 4.000000 3192.00 3193.000 512.0000 512.0000
Max. 8.000000 8.000000 7.5402778 7.2204861 31.1277778 25.440278 2983.0000 2483.16667 17548.0000 18524.0000 286.6983330 90.4422220 922.285000 524.545556 50.000000 45.000000 313.000000 208.000000 39519.0000 20002.14286 177583.0000 168812.1429 182555.00 170532.00 49.291667 42.400000 910.400000 449.3333333 554.00000 357.000000 32.000000 32.250000 100.00000 100.000000 1779.00000 2215.000000 2551.00000 2342.00000 110.000000 184.000000 50660199.500 2.259481e+07 2019.000 2019.000 7095.0000 6972.000 840.0000 780.0000 294902.000 1572801.000 32.000000 50.000000 37221.00 28900.000 6144.0000 6144.0000

QQ-Plots

## Set up a 4 x 2 plotting space: each row pairs a metric's V67 (training)
## QQ plot with its V68 (validation) QQ plot
par(mfrow = c(4, 2))

## QQ plots comparing the Beta and Release samples of every metric in
## `user_eng`, annotated with the two-sample Kolmogorov-Smirnov statistic
for (i in user_eng) {

  # Training
  x_t <- df_beta_ue[[i]]
  y_t <- df_release_ue[[i]]

  # Shared axis range so the identity line is a meaningful reference
  rg_t <- range(x_t, y_t, na.rm = TRUE)

  test_t <- ks.test(x_t, y_t)$statistic
  test_t <- paste("KS Test = ", round(test_t, 3))

  # Validation
  x_v <- df_beta_v_ue[[i]]
  y_v <- df_release_v_ue[[i]]

  rg_v <- range(x_v, y_v, na.rm = TRUE)

  test_v <- ks.test(x_v, y_v)$statistic
  test_v <- paste("KS Test = ", round(test_v, 3))

  ########

  title_t <- paste("V67", i, sep = "\n")
  qqplot(x_t, y_t, main = title_t, xlim = rg_t, ylim = rg_t,
         xlab = "Beta", ylab = "Release", pch = 1)
  # Anchor the KS label at the top-left of the data. The top is rg_t[2]
  # (the NA-safe maximum of both samples); a bare max() comparison would
  # error on a metric that contains missing values.
  text(min(x_t, na.rm = TRUE), rg_t[2], test_t, adj = c(0, 1))
  abline(0, 1, col = "#fe346e", lty = 2)  # identity line: equal quantiles

  title_v <- paste("V68", i, sep = "\n")
  qqplot(x_v, y_v, main = title_v, xlim = rg_v, ylim = rg_v,
         xlab = "Beta", ylab = "Release", pch = 1)
  text(min(x_v, na.rm = TRUE), rg_v[2], test_v, adj = c(0, 1))
  abline(0, 1, col = "#fe346e", lty = 2)  # identity line: equal quantiles
}

Observations

# Render the KS-distance table under a single "KS Distance" header that
# spans all three columns, then highlight the hard-coded rows
# 2, 18 and 21-26 in bold white text on a green background.
ks_tbl_html <- kable(tbl_ks)
ks_tbl_html <- add_header_above(ks_tbl_html, c("KS Distance" = 3))
ks_tbl_html <- kable_styling(
  ks_tbl_html,
  bootstrap_options = c("striped", "hover", "condensed"),
  full_width = FALSE
)
row_spec(
  ks_tbl_html,
  c(2, 18, 21:26),
  bold = TRUE,
  color = "white",
  background = "#c3f584"
)
KS Distance
training validation
num_active_days 0.071 0.166
active_hours 0.045 0.058
active_hours_max 0.044 0.077
uri_count 0.047 0.071
uri_count_max 0.054 0.095
session_length 0.099 0.076
session_length_max 0.073 0.040
search_count 0.014 0.045
search_count_max 0.019 0.059
num_bookmarks 0.026 0.042
num_pages 0.055 0.082
num_pages_max 0.054 0.082
num_addons 0.601 0.292
daily_unique_domains 0.029 0.040
daily_unique_domains_max 0.033 0.054
daily_max_tabs 0.094 0.075
daily_max_tabs_max 0.080 0.051
daily_tabs_opened 0.035 0.036
daily_tabs_opened_max 0.032 0.053
daily_num_sessions_started 0.119 0.098
daily_num_sessions_started_max 0.116 0.117
startup_ms 0.129 0.130
install_year 0.018 0.017
profile_age 0.042 0.035
timezone_offset 0.208 0.235
memory_mb 0.075 0.109
cpu_cores 0.064 0.087
cpu_speed_mhz 0.046 0.062
cpu_l2_cache_kb 0.025 0.035
  • Looking at the plots of each dataset separately, the Beta and Release distributions are, in general, very similar to each other
  • For instance, this holds for metrics related to active hours and number of active days, search count, number of pages, daily unique domains and daily number of sessions started
  • Comparing only the training and validation plots, we can observe large differences between the variables. Only a few showed similar distances (marked in green)

Violin Plots

The following violin plots depict the distributions for the beta and release subsets, for both versions v67 and v68. The violin plot is a powerful data visualization technique since it allows us to compare both the ranking of several groups and their distributions.

NOTE: Guiding lines have been added for the following:

  • red dashed: Release mean
  • blue dashed: Beta mean
  • red solid: Release median
  • blue solid: Beta median
## Violin plots

# Summarise one metric per channel.
#   df     : data frame with a `label` column and a column named `metric`
#   metric : column name, as a string
#   stat   : summary function accepting `na.rm` (e.g. mean, median)
# Returns one row per `label` with the summary in column `value`.
# Missing values are ignored so a single NA does not blank the guide line.
summarise_by_channel <- function(df, metric, stat) {
  df %>%
    group_by(label) %>%
    summarise(value = stat(.data[[metric]], na.rm = TRUE))
}

# Build the horizontal violin plot of `metric` by channel for one dataset,
# with an overlaid box plot and per-channel guide lines
# (dashed = mean, solid = median).
build_violin <- function(df, metric) {
  channel_means <- summarise_by_channel(df, metric, mean)
  channel_medians <- summarise_by_channel(df, metric, median)

  ggplot(df, aes(x = label, y = .data[[metric]], fill = label)) +
    geom_violin(trim = FALSE) +
    labs(title = metric, x = "Channel", y = "Measure") +
    scale_fill_manual(values = c("#111d5e", "#b21f66")) +
    geom_hline(
      data = channel_means,
      aes(yintercept = value, colour = label),
      linetype = "dashed",
      size = 1
    ) +
    geom_hline(
      data = channel_medians,
      aes(yintercept = value, colour = label),
      linetype = "solid",
      size = 1
    ) +
    scale_colour_manual(values = c("blue", "red")) +
    geom_boxplot(width = 0.1, fill = "white", alpha = 0.5) +
    theme_minimal() +
    theme(legend.position = "none") +
    # Flip so the channels are stacked vertically and the measure runs
    # along the horizontal axis
    coord_flip()
}

# One figure per metric: training data on top, validation data below
for (i in user_eng) {
  violin_train <- build_violin(df_train_ue, i)
  violin_valid <- build_violin(df_validate_ue, i)

  print(plot_grid(violin_train, violin_valid, ncol = 1,
                  labels = c("V67", "V68")))
}

Observations

The violin plots above show the relationship of channel type to user engagement metrics. Overall, the results were quite similar comparing the two versions (v67 and v68). Only the following metrics yielded different results:

  • active_hours_max
  • daily_unique_domains_max

Now, let’s take a closer look at the similar comparative results:

Which variables had similar or equal means and medians for beta and release users?

  • uri_count
  • uri_count_max
  • session_length_max
  • search_count
  • search_count_max
  • daily_unique_domains
  • daily_tabs_opened_max
  • install_year
  • profile_age
  • memory_mb
  • cpu_speed_mhz

Which variables had higher means and/or medians for release users than for beta users?

  • num_active_days
  • active_hours
  • num_pages
  • num_pages_max
  • daily_num_sessions_started
  • daily_num_sessions_started_max
  • cpu_cores

Which variables had lower means and/or medians for release users than for beta users?

  • session_length
  • num_addons
  • daily_tabs_opened
  • timezone_offset
  • cpu_l2_cache_kb

Four variables were difficult to visualize with violin plots, so we plot them as ridgeline charts instead.

vlist <- c("num_bookmarks", "daily_max_tabs", "daily_max_tabs_max", "startup_ms")

## Ridgeline plots

# Build the binned ridgeline plot of `metric` by channel for one dataset.
# The measure is on the x axis and the channel (`label`) on the y axis, so
# the axis titles are "Measure" and "Channel" respectively. There is no
# coord_flip() here, unlike the violin plots.
build_ridge <- function(df, metric) {
  ggplot(df, aes(x = .data[[metric]], y = label, fill = label)) +
    geom_density_ridges(alpha = 0.6, stat = "binline", bins = 20) +
    labs(title = metric, x = "Measure", y = "Channel") +
    theme_ridges() +
    scale_fill_manual(values = c("#111d5e", "#b21f66")) +
    theme(legend.position = "none")
}

# One figure per metric: training data on top, validation data below
for (i in vlist) {
  ridge_train <- build_ridge(df_train_ue, i)
  ridge_valid <- build_ridge(df_validate_ue, i)

  print(plot_grid(ridge_train, ridge_valid, ncol = 1,
                  labels = c("V67", "V68")))
}

Discrete Variables

This section focuses only on the discrete user engagement metrics. We are going to analyze the following metrics:

  • default_search_engine
  • is_default_browser
  • profile_age_cat
  • distro_id_norm
  • memory_cat
  • cpu_speed_cat
  • cpu_cores_cat
  • cpu_l2_cache_kb_cat
  • cpu_vendor
  • os_version
  • is_wow64
  • fxa_configured
  • sync_configured
  • locale
  • country
  • timezone_cat
  • label
  • normalized_channel
  • is_release

Training

par(mfrow = c(2, 2))  # 2 x 2 grid: each row shows Beta next to Release

# One panel per channel: source data frame, panel title and bar colour
channel_panels <- list(
  list(data = df_beta_ue_dis, title = "Beta", fill = "#111d5e"),
  list(data = df_release_ue_dis, title = "Release", fill = "#b21f66")
)

## Relative-frequency bar charts comparing Beta and Release for every
## discrete metric in `user_eng_dis`
for (metric in user_eng_dis) {
  for (panel in channel_panels) {
    # Frequency of each level divided by the channel's total row count
    level_share <- table(panel$data[, metric]) / nrow(panel$data)

    # barplot() returns the bar midpoints, used to place the labels
    bar_mids <- barplot(
      level_share,
      main = panel$title,
      ylim = c(0, 1),
      border = FALSE,
      col = panel$fill,
      xlab = metric,
      ylab = "Relative Frequency"
    )

    # Percentage label just above each bar
    text(bar_mids, level_share + 0.025,
         paste0(round(level_share * 100), "%"), cex = 1)
  }
}

Validation

par(mfrow = c(2, 2))  # 2 x 2 grid: each row shows Beta next to Release

# One panel per channel: source data frame, panel title and bar colour
channel_panels_v <- list(
  list(data = df_beta_v_ue_dis, title = "Beta", fill = "#111d5e"),
  list(data = df_release_v_ue_dis, title = "Release", fill = "#b21f66")
)

## Relative-frequency bar charts comparing Beta and Release for every
## discrete metric in `user_eng_dis`
for (metric in user_eng_dis) {
  for (panel in channel_panels_v) {
    # Frequency of each level divided by the channel's total row count
    level_share <- table(panel$data[, metric]) / nrow(panel$data)

    # barplot() returns the bar midpoints, used to place the labels
    bar_mids <- barplot(
      level_share,
      main = panel$title,
      ylim = c(0, 1),
      border = FALSE,
      col = panel$fill,
      xlab = metric,
      ylab = "Relative Frequency"
    )

    # Percentage label just above each bar
    text(bar_mids, level_share + 0.025,
         paste0(round(level_share * 100), "%"), cex = 1)
  }
}


Observations

  • In general, the distributions are very similar to each other for both versions